####
#### Purpose: Main and supplemental analysis 
#### Author: Kelsey Shoub (kshoub@umass.edu)
#### Last Updated: Oct. 3, 2023
####

############## 1. Setting it up. ##############

# Opening the packages needed to run this R script. 
library(dplyr)
library(ggplot2)
library(texreg)

# Setting the working directory.
setwd("~/Desktop/PolicingTStops/DrivingWhileDifferent/Replication Files/")

# Loading in the cleaned and compiled data file from step one. 
load("Data/CA.RData")

# Additional cleaning. 
ca = ca %>% mutate(
        ask_consent2 = ifelse(search_type == "Discretionary: Consent Search", 1, ask_consent),
        give_consent2 = ifelse(search_type == "Discretionary: Consent Search", 1, give_consent),
        ask_consent3 = ifelse(ask_consent2==1 & search_type != "Discretionary: Consent Search", NA, 
                              ask_consent2),
        give_consent3 = ifelse(ask_consent2==1 & search_type != "Discretionary: Consent Search", -2, 
                               give_consent2),
        school_stop = ifelse(REASON_FOR_STOP==5,1,school_stop),
        stops = 1,
        stop_type = ifelse(REASON_FOR_STOP == 1 & 
                                   investigatory_stop == 0,0,
                           ifelse(REASON_FOR_STOP==1&
                                          investigatory_stop==1,1,
                                  REASON_FOR_STOP)),
        warrant = ifelse(is.na(BFS_SEARCH_WARRANT),0,BFS_SEARCH_WARRANT),
        search_occur2 = ifelse(search_occur == 0, 0, 
                               ifelse(search_occur==1 & search_type=="No Search",NA,
                                      ifelse(search_occur==1&warrant==1,NA,1)))
        )

# Agencies by year
table(ca$school_stop)
table(ca$REASON_FOR_STOP %in% c(3,4))

# Filtering the data to the stops that meet the scope conditions. 
ca = ca %>% filter(school_stop==0 & agency_type == "Police" & 
                           AGENCY_NAME != "LAUSD PD-GRAND" & 
                           stop_type != 6 & 
                           REASON_FOR_STOP!=3 & 
                           REASON_FOR_STOP!=4)

ca = ca %>% mutate(
        dupes = duplicated(ca$DOJ_RECORD_ID),
        to_drop = ifelse(dupes == T & PERSON_NUMBER!=1,1,0)
        ) %>%
        filter(to_drop == 0)

# Intersectional coding of race and gender. 
ca$race.gender = ifelse(ca$G_FULL==1, 
                        ifelse(ca$race_recode == 0,0, 
                               ifelse(ca$race_recode == 2,2, 
                                      ifelse(ca$race_recode == 3,4, 
                                             ifelse(ca$race_recode == 1,6, 
                                                    ifelse(ca$race_recode == 4,8,10))))),
                        NA)
ca$race.gender = ifelse(ca$G_FULL==2, 
                        ifelse(ca$race_recode == 0,1, 
                               ifelse(ca$race_recode == 2,3, 
                                      ifelse(ca$race_recode == 3,5, 
                                             ifelse(ca$race_recode == 1,7, 
                                                    ifelse(ca$race_recode == 4,9,11))))),
                        ca$race.gender)

# Providing labels and transforming into a factor variable. 
ca$stop_type = factor(ca$stop_type,
                      levels=c(0,1,2),
                      labels=c("Traffic Safety",
                               "Pretextual Traffic Stop",
                               "Reasonable Suspicion"))
ca$race.gender = factor(ca$race.gender,
                        levels = 0:11,
                        labels = c("White Man","White Woman",
                                   "Black Man","Black Woman",
                                   "Latino Man","Latina Woman",
                                   "Asian Man","Asian Woman",
                                   "ME/SA Man","ME/SA Woman",
                                   "Other Race Man","Other Race Woman"))

############## 2. Figure and counts that appear in the text. ##############

# Contra Sum
ca$contra_sum = rowSums(ca[,21:30],na.rm=T)

# Search Regressions 
search.pd = lm(search_occur2 ~  
                       factor(race_recode) + factor(G_FULL) + AGE + 
                       LIMITED_ENGLISH_FLUENCY + disability + LGBT +
                       investigatory_stop +
                       dow + factor(hod) + factor(month) + factor(year) +
                       AGENCY_NAME,
               data = ca %>% filter(agency_type == "Police"))

search.pd.contra = lm(contraband_found ~  
                              factor(race_recode) + factor(G_FULL) + AGE + 
                              LIMITED_ENGLISH_FLUENCY + disability + LGBT +
                              investigatory_stop +
                              dow + factor(hod) + factor(month) + factor(year) +
                              AGENCY_NAME,
                      data = ca %>% filter(agency_type == "Police" & 
                                                   search_occur2 == 1))

# Total observations: 1986584
nrow(ca)

# Complete observations: 1984003 
nrow(ca)-length(search.pd$na.action)

# Number + Percent of Searches
ca_searchoccured = table(ca$search_occur2)
ca_searchoccured[2]
prop.table(ca_searchoccured)[2]

# Number + Percent Contraband Found
ca_contaoccured = table(ca$contraband_found[ca$search_occur2==1])
ca_contaoccured[2]
prop.table(ca_contaoccured)[2]

#Coef Plots
coef.names = c("Intercept",
               "Asian","Black","Hispanic",
               "Middle Eastern/South Asian","Native American",
               "Pacific Islander","Multiracial",
               "Female","Transgender Man/Boy",
               "Transgender Woman/Girl",
               "Gender Noncomforming",
               "Age","Limited English Fluency",
               "Disability","LGBT",
               "Pretextual Stop",
               "DOW: Monday","DOW: Saturday","DOW: Sunday","DOW: Thursday",
               "DOW: Tuesday","DOW: Wednesday",paste("HOD:",1:23),
               paste("Month:",2:12),"Year: 2019","Year: 2020",
               "Agency: Davis","Agency: Fresno PD","Agency: LAPD",  
               "Agency: Long Beach PD","Agency: Oakland PD",
               "Agency: Sacramento PD","Agency: San Diego PD",
               "Agency: San Francisco PD",
               "Agency: San Jose PD")
coef.type = c(1,
              rep(2,7),
              rep(3,4),
              rep(4,4),
              rep(5,52))

lm.search.table = summary(search.pd)$coefficients
lm.contra.table = summary(search.pd.contra)$coefficients

lm.contra.table = as.data.frame(lm.contra.table)
lm.contra.table$Names = coef.names
lm.contra.table$Coef.Type = coef.type
lm.contra.table$include = c(0,rep(1,15),0,rep(0,51))
lm.contra.table$Outcome = 2
lm.search.table = as.data.frame(lm.search.table)
lm.search.table$Names = coef.names
lm.search.table$Coef.Type = coef.type
lm.search.table$include = c(0,rep(1,15),0,rep(0,51))
lm.search.table$Outcome = 1

lm.table = bind_rows(lm.search.table,lm.contra.table)
lm.table$p.value = ifelse(lm.table$`Pr(>|t|)`<0.05,1,2)
lm.table$p.value = factor(lm.table$p.value, 
                          levels=1:2,
                          labels=c("Yes","No"))
lm.table$lower = lm.table$Estimate - 1.96 * lm.table$`Std. Error`
lm.table$upper = lm.table$Estimate + 1.96 * lm.table$`Std. Error`
lm.table$Outcome = factor(lm.table$Outcome,
                          levels=1:2,
                          labels=c("Model 1\nSearch Occurred?",
                                   "Model 2\nContraband Found?"))
lm.table$Coef.Type = factor(lm.table$Coef.Type,
                            levels=1:5,
                            labels=c("Intercept","Race","Gender",
                                     "Additional","Misc"))

pdf("Figures/CoefPlot2.pdf",9.99,5.96)
ggplot(lm.table %>% filter(include == 1), 
       aes(Estimate,Names,shape=p.value,color=p.value)) + 
        geom_point(size = 3) + 
        geom_pointrange(aes(xmin=lower, xmax=upper)) +
        geom_vline(xintercept = 0,linetype="dashed") +
        theme_bw(base_size=15)+
        facet_grid(Coef.Type~Outcome,scales = "free",space="free_y") +
        guides(shape=guide_legend(title="P < 0.05?"),
               color=guide_legend(title="P < 0.05?")) + 
        scale_color_grey() +
        theme(legend.position = "bottom")
dev.off()

pdf("Figures/CoefPlot2.pdf",9.99,5.96)
ggplot(lm.table %>% filter(include == 1), 
       aes(Estimate,Names,shape=p.value,color=p.value)) + 
        geom_point(size = 3) + 
        geom_pointrange(aes(xmin=lower, xmax=upper)) +
        geom_vline(xintercept = 0,linetype="dashed") +
        theme_bw(base_size=15)+
        facet_grid(Coef.Type~Outcome,scales = "free",space="free_y") +
        guides(shape=guide_legend(title="P < 0.05?"),
               color=guide_legend(title="P < 0.05?")) + 
        scale_color_grey() +
        theme(legend.position = "bottom")
dev.off()

############## 3. Appendix: Summary Information ##############

ca$stops = 1

# Agencies by year
overall.df = data.frame("Year" = 2018:2020,
                        "Agencies" = c(paste(unique(ca$AGENCY_NAME[ca$year==2018]),collapse = ", "),
                                       paste(unique(ca$AGENCY_NAME[ca$year==2019]),collapse = ", "),
                                       paste(unique(ca$AGENCY_NAME[ca$year==2020]),collapse = ", ")),
                        "N Police" = c(length(unique(ca$AGENCY_NAME[ca$year==2018&ca$agency_type=="Police"])),
                                       length(unique(ca$AGENCY_NAME[ca$year==2019&ca$agency_type=="Police"])),
                                       length(unique(ca$AGENCY_NAME[ca$year==2020&ca$agency_type=="Police"]))),
                        "Stops" = c(sum(ca$stops[ca$year==2018],na.rm=T),
                                    sum(ca$stops[ca$year==2019],na.rm=T),
                                    sum(ca$stops[ca$year==2020],na.rm=T)),
                        "Searches" = c(sum(ca$search_occur[ca$year==2018],na.rm=T),
                                       sum(ca$search_occur[ca$year==2019],na.rm=T),
                                       sum(ca$search_occur[ca$year==2020],na.rm=T)))

xtable::xtable(overall.df)

# Totals
stops.df = ca %>%
        group_by(agency_type) %>%
        summarise(Race.White = sum(ifelse(race_recode == 0,1,0),na.rm=T),
                  Race.Asian = sum(ifelse(race_recode == 1,1,0),na.rm=T),
                  Race.Black = sum(ifelse(race_recode == 2,1,0),na.rm=T),
                  Race.Hsp = sum(ifelse(race_recode == 3,1,0),na.rm=T),
                  Race.ME = sum(ifelse(race_recode == 4,1,0),na.rm=T),
                  Race.NA = sum(ifelse(race_recode == 5,1,0),na.rm=T),
                  Race.PI = sum(ifelse(race_recode == 6,1,0),na.rm=T),
                  Race.Multi = sum(ifelse(race_recode == 8,1,0),na.rm=T),
                  Gender.Male = sum(ifelse(G_FULL == 1,1,0),na.rm=T),
                  Gender.Female = sum(ifelse(G_FULL == 2,1,0),na.rm=T),
                  Gender.TMale = sum(ifelse(G_FULL == 3,1,0),na.rm=T),
                  Gender.TFemale = sum(ifelse(G_FULL == 4,1,0),na.rm=T),
                  Gender.Non = sum(ifelse(G_FULL == 5,1,0),na.rm=T),
                  Disability1 = sum(ifelse(disability==0,1,0),na.rm=T), 
                  Disability2 = sum(ifelse(disability==1,1,0),na.rm=T), 
                  Fluency0 = sum(ifelse(LIMITED_ENGLISH_FLUENCY==0,1,0),na.rm=T),
                  Fluency1 = sum(ifelse(LIMITED_ENGLISH_FLUENCY==1,1,0),na.rm=T),
                  LGBT1 = sum(ifelse(LGBT==0,1,0),na.rm=T),
                  LGBT2 = sum(ifelse(LGBT==1,1,0),na.rm=T),
                  Age = mean(AGE,na.rm=T)) %>% ungroup()

# Search Rates
searches.df = ca %>%
        group_by(agency_type) %>%
        summarise(Race.White = sum(ifelse(race_recode == 0 & search_occur2==1,1,0),na.rm=T),
                  Race.Asian = sum(ifelse(race_recode == 1& search_occur2==1,1,0),na.rm=T),
                  Race.Black = sum(ifelse(race_recode == 2& search_occur2==1,1,0),na.rm=T),
                  Race.Hsp = sum(ifelse(race_recode == 3& search_occur2==1,1,0),na.rm=T),
                  Race.ME = sum(ifelse(race_recode == 4& search_occur2==1,1,0),na.rm=T),
                  Race.NA = sum(ifelse(race_recode == 5& search_occur2==1,1,0),na.rm=T),
                  Race.PI = sum(ifelse(race_recode == 6& search_occur2==1,1,0),na.rm=T),
                  Race.Multi = sum(ifelse(race_recode == 8& search_occur2==1,1,0),na.rm=T),
                  Gender.Male = sum(ifelse(G_FULL == 1& search_occur2==1,1,0),na.rm=T),
                  Gender.Female = sum(ifelse(G_FULL == 2& search_occur2==1,1,0),na.rm=T),
                  Gender.TMale = sum(ifelse(G_FULL == 3& search_occur2==1,1,0),na.rm=T),
                  Gender.TFemale = sum(ifelse(G_FULL == 4& search_occur2==1,1,0),na.rm=T),
                  Gender.Non = sum(ifelse(G_FULL == 5& search_occur2==1,1,0),na.rm=T),
                  Disability1 = sum(ifelse(disability==0& search_occur2==1,1,0),na.rm=T), 
                  Disability2 = sum(ifelse(disability==1& search_occur2==1,1,0),na.rm=T), 
                  Fluency0 = sum(ifelse(LIMITED_ENGLISH_FLUENCY == 0& search_occur2==1,1,0),na.rm=T),
                  Fluency1 = sum(ifelse(LIMITED_ENGLISH_FLUENCY == 1& search_occur2==1,1,0),na.rm=T),
                  LGBT1 = sum(ifelse(LGBT == 0& search_occur2==1,1,0),na.rm=T),
                  LGBT2 = sum(ifelse(LGBT == 1& search_occur2==1,1,0),na.rm=T),
                  Age = mean(AGE[search_occur2==1],na.rm=T)) %>% ungroup()

xtable::xtable(cbind(t(stops.df[,2:21]),
                     t(searches.df[,2:21]),
                     c((t(searches.df[,2:20])/t(stops.df[,2:20]))*100,NA)),round=2)

############## 4. Appendix: Regression Table -- LPM ############## 

texreg(list(search.pd,search.pd.contra),
       digits = 3,
       omit.coef = "dow*|hod*|month*|year*|AGENCY*|investig*",
       custom.coef.names = c("Intercept",
                             "Race: Asian","Race: Black","Race: Hispanic",
                             "Race: Middle Eastern/South Asian","Race: Native American",
                             "Race: Pacific Islander","Race: Multiracial",
                             "Gender: Female","Gender: Transgender Man/Boy",
                             "Gender: Transgender Woman/Girl",
                             "Gender: Gender Noncomforming",
                             "Age","Limited English Fluency",
                             "Disability","LGBT"),
       stars=0.05)

############## 5. Appendix: Regressions -- Logit ##############

# Search Regressions 
search.pd2 = glm(search_occur2 ~  
                 factor(race_recode) + factor(G_FULL) + AGE + 
                 LIMITED_ENGLISH_FLUENCY + disability + LGBT +
                 investigatory_stop +
                 dow + factor(hod) + factor(month) + factor(year) +
                 AGENCY_NAME,
               data = ca %>% filter(agency_type == "Police"),
               family="binomial")

search.pd.contra2 = glm(contraband_found ~  
                        factor(race_recode) + factor(G_FULL) + AGE + 
                        LIMITED_ENGLISH_FLUENCY + disability + LGBT +
                        investigatory_stop +
                        dow + factor(hod) + factor(month) + factor(year) +
                        AGENCY_NAME,
                      data = ca %>% filter(agency_type == "Police" &
                                                   search_occur2 == 1),
                      family="binomial")

texreg(list(search.pd2,search.pd.contra2),
       digits = 3,
       omit.coef = "dow*|hod*|month*|year*|AGENCY*|investig*",
       custom.coef.names = c("Intercept",
                             "Race: Asian","Race: Black","Race: Hispanic",
                             "Race: Middle Eastern/South Asian","Race: Native American",
                             "Race: Pacific Islander","Race: Multiracial",
                             "Gender: Female","Gender: Transgender Man/Boy",
                             "Gender: Transgender Woman/Girl",
                             "Gender: Gender Noncomforming",
                             "Age","Limited English Fluency",
                             "Disability","LGBT"),
       stars=0.05)
